########################
# Estimate Single Bart
# And produce Inclusion
# Probability Plot
########################

## Set RNG 
set.seed(831213)

## Get processed data
load("./FullData.RData")

## Get best parameters from parameter sweep
bestParams <- read.csv("./BSParamSweep.csv"
                       ,nrows=1
)[,1:6]

## Estimate BART with all observations and all features
# thisBartFull <- bart(x.train=all.data$fData[,-grep("fraud|uid|train", colnames(all.data$fData))]
#                      ,y.train=all.data$fData[, "fraud.score"],
#                      ,k = bestParams$k
#                      ,power = bestParams$pow
#                      ,base = bestParams$base
#                      ,sigdf = bestParams$sigdf
#                      ,sigquant = bestParams$sigquant
#                      ,sigest=0.3
#                      ,nskip = 50e3
#                      ,keepevery=10
#                      ,ndpost = 5e3
#                      ,ntree = 100)
# 
# ## Save BART predictions so they can be used 
# ## by external validity script
# save(thisBartFull, file="./FullBart.RData")
load("./FullBart.RData")

## Get nr. of variables (a.k.a. features) used
numVars <- length(colMeans(thisBartFull$varcount))

## Calculate relative frequency of feature usage
## in splitting rules, and sort
values <- colMeans(prop.table(thisBartFull$varcount,1))
ordering <- order(values)
names <- colnames(all.data$fData[,-grep("fraud|uid|train", colnames(all.data$fData))])

## Assign more legible names to variables
names <- c("Uniform last digit",
           "Distance last pair",
           "Mean 2nd digit",
           "Benford 2nd digit",
           "Ethnolinguistic fract.",           
           "Urbanization",
           "Turnout",
           "Regime type (anocracy)",
           "Regime type (autocracy)",
           "Regime type (missing)",
           "Regime type (new democracy)",
           "Regime  type (old democracy)",
           "Inequality (1st quart.)",
           "Inequality (2nd quart.)",
           "Inequality (3rd quart.)",
           "Inequality (4th quart.)",
           "Inequality (missing)",
           "Average district magnitude (1st quart.)",
           "Average district magnitude (2nd quart.)",
           "Average. district magnitude (3rd quart.)",
           "Average district magnitude (4th quart.)",
           "Average district magnitude (missing)",
           "Change in GDP (1st quart.)",
           "Change in GDP (2nd quart.)",
           "Change in GDP (3rd quart.)",
           "Change in GDP (4th quart.)",
           "Change in GDP (missing)",
           "Gov. Electoral Commission",
           "Mixed Electoral Commission",
           "Indep. Electoral Commission",
           "Regime Instability")

## Create Graph
postscript("./InclusionProbs.ps")
par(yaxt="n", mfrow=c(1,1), mar=c(2.5,14,.5,.5), bty="n", mgp=c(1,0,0), tcl=0)
plot(values[ordering], 1:numVars, ylab="", pch=19, main=""
     , xlab="Posterior average inclusion rate"
     ,xlim=c(0.02,0.065))
par(las=1)
mtext(names[ordering], at=1:numVars, cex=.8, side=2, line=.25)
for(i in 1:numVars){
  abline(h=i, lty=2, col="gray80")
}
points(values[ordering], 1:numVars, ylab="", pch=19)
dev.off()




